<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=2">
<meta name="theme-color" content="#222">
<meta name="generator" content="Hexo 4.2.0">
  <link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon-32x32-next.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon-16x16-next.png">
  <link rel="mask-icon" href="/images/logo.svg" color="#222">
  <link rel="manifest" href="/images/manifest.json">
  <meta name="msapplication-config" content="/images/browserconfig.xml">

<link rel="stylesheet" href="/css/main.css">


<link rel="stylesheet" href="/lib/font-awesome/css/font-awesome.min.css">
  <!-- Pace.js theme stylesheet and script from the jsDelivr CDN, requested over explicit HTTPS so they also resolve when the page is not served over http(s). -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pace-js@1/themes/blue/pace-theme-minimal.css">
  <script src="https://cdn.jsdelivr.net/npm/pace-js@1/pace.min.js"></script>

<script id="hexo-configurations">
    // Shared namespace object for the theme scripts (NexT.utils, NexT.boot, NexT.motion are
    // used further down this page); an already-defined window.NexT is reused.
    var NexT = window.NexT || {};
    // Site and theme settings serialized as one object literal: hostname/root, scheme and version,
    // feature toggles (sidebar, copycode, back2top, bookmark, comments, motion, ...), search labels
    // and the local-search index path. Later inline scripts read CONFIG.comments, CONFIG.bookmark
    // and CONFIG.motion from it.
    var CONFIG = {"hostname":"daydaychen.github.io","root":"/","scheme":"Gemini","version":"7.7.2","exturl":false,"sidebar":{"position":"right","display":"post","padding":18,"offset":12,"onmobile":false},"copycode":{"enable":true,"show_result":true,"style":"mac"},"back2top":{"enable":true,"sidebar":true,"scrollpercent":true},"bookmark":{"enable":false,"color":"#222","save":"auto"},"fancybox":false,"mediumzoom":true,"lazyload":true,"pangu":true,"comments":{"style":"tabs","active":"valine","storage":true,"lazyload":true,"nav":null,"activeClass":"valine"},"algolia":{"hits":{"per_page":10},"labels":{"input_placeholder":"Search for Posts","hits_empty":"We didn't find any results for the search: ${query}","hits_stats":"${hits} results found in ${time} ms"}},"localsearch":{"enable":true,"trigger":"auto","top_n_per_article":1,"unescape":true,"preload":false},"motion":{"enable":true,"async":true,"transition":{"post_block":"fadeIn","post_header":"slideDownIn","post_body":"slideDownIn","coll_header":"slideLeftIn","sidebar":"slideUpIn"}},"path":"search.xml"};
  </script>

  <meta name="description" content="使用python中的re模块处理YouTube字幕文件状况：18个字幕文件，编码格式Window-1254，命名规律，按数字排序 需求：将字幕文件中的演讲词提取出来，剔除无用的字符，并合并为一个文件 文件命名格式1234567891011121314151617181920ls tmp&#x2F;total 84K-rw-r--r-- 1 root root 3.1K Apr 22 14:50 0">
<meta property="og:type" content="article">
<meta property="og:title" content="使用正则处理字幕文件">
<meta property="og:url" content="https://daydaychen.github.io/2020/03/06/%E4%BD%BF%E7%94%A8%E6%AD%A3%E5%88%99%E5%A4%84%E7%90%86%E5%AD%97%E5%B9%95%E6%96%87%E4%BB%B6%EF%BC%8Cpython/index.html">
<meta property="og:site_name" content="宁静致远">
<meta property="og:description" content="使用python中的re模块处理YouTube字幕文件状况：18个字幕文件，编码格式Window-1254，命名规律，按数字排序 需求：将字幕文件中的演讲词提取出来，剔除无用的字符，并合并为一个文件 文件命名格式1234567891011121314151617181920ls tmp&#x2F;total 84K-rw-r--r-- 1 root root 3.1K Apr 22 14:50 0">
<meta property="og:locale" content="zh_CN">
<meta property="article:published_time" content="2020-03-06T06:13:49.912Z">
<meta property="article:modified_time" content="2020-03-06T06:13:49.912Z">
<meta property="article:author" content="CT">
<meta name="twitter:card" content="summary">

<link rel="canonical" href="https://daydaychen.github.io/2020/03/06/%E4%BD%BF%E7%94%A8%E6%AD%A3%E5%88%99%E5%A4%84%E7%90%86%E5%AD%97%E5%B9%95%E6%96%87%E4%BB%B6%EF%BC%8Cpython/">


<script id="page-configurations">
  // Page-level settings attached to the global CONFIG object
  // (Hexo page variables: https://hexo.io/docs/variables.html).
  CONFIG.page = { sidebar: "", isHome: false, isPost: true };
</script>

  <title>使用正则处理字幕文件 | 宁静致远</title>
  






  <noscript>
  <style>
  .use-motion .brand,
  .use-motion .menu-item,
  .sidebar-inner,
  .use-motion .post-block,
  .use-motion .pagination,
  .use-motion .comments,
  .use-motion .post-header,
  .use-motion .post-body,
  .use-motion .collection-header { opacity: initial; }

  .use-motion .site-title,
  .use-motion .site-subtitle {
    opacity: initial;
    top: initial;
  }

  .use-motion .logo-line-before i { left: initial; }
  .use-motion .logo-line-after i { right: initial; }
  </style>
</noscript>

</head>

<body itemscope itemtype="http://schema.org/WebPage">
  <div class="container use-motion">
    <div class="headband"></div>

    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="切换导航栏">
      <span class="toggle-line toggle-line-first"></span>
      <span class="toggle-line toggle-line-middle"></span>
      <span class="toggle-line toggle-line-last"></span>
    </div>
  </div>

  <div class="site-meta">

    <div>
      <a href="/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">宁静致远</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
        <p class="site-subtitle">this is a subtitle.</p>
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger">
        <i class="fa fa-search fa-fw fa-lg"></i>
    </div>
  </div>
</div>


<nav class="site-nav">
  
  <ul id="menu" class="menu">
        <li class="menu-item menu-item-home">

    <a href="/" rel="section"><i class="fa fa-fw fa-home"></i>首页</a>

  </li>
        <li class="menu-item menu-item-about">

    <a href="/about/" rel="section"><i class="fa fa-fw fa-user"></i>关于</a>

  </li>
        <li class="menu-item menu-item-tags">

    <a href="/tags/" rel="section"><i class="fa fa-fw fa-tags"></i>标签</a>

  </li>
        <li class="menu-item menu-item-categories">

    <a href="/categories/" rel="section"><i class="fa fa-fw fa-th"></i>分类</a>

  </li>
        <li class="menu-item menu-item-archives">

    <a href="/archives/" rel="section"><i class="fa fa-fw fa-archive"></i>归档</a>

  </li>
      <li class="menu-item menu-item-search">
        <a role="button" class="popup-trigger"><i class="fa fa-search fa-fw"></i>搜索
        </a>
      </li>
  </ul>

</nav>
  <div class="site-search">
    <div class="popup search-popup">
    <div class="search-header">
  <span class="search-icon">
    <i class="fa fa-search"></i>
  </span>
  <div class="search-input-container">
    <!-- Text field of the search popup. The placeholder is not an accessible name, so aria-label supplies one for assistive technology. -->
    <input autocomplete="off" autocorrect="off" autocapitalize="off"
           placeholder="搜索..." spellcheck="false"
           type="search" class="search-input" aria-label="搜索">
  </div>
  <span class="popup-btn-close">
    <i class="fa fa-times-circle"></i>
  </span>
</div>
<div id="search-result"></div>

</div>
<div class="search-pop-overlay"></div>

  </div>
</div>
    </header>

    


    <main class="main">
      <div class="main-inner">
        <div class="content-wrap">
          

          <div class="content">
            

  <div class="posts-expand">
      
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-block " lang="zh-CN">
    <link itemprop="mainEntityOfPage" href="https://daydaychen.github.io/2020/03/06/%E4%BD%BF%E7%94%A8%E6%AD%A3%E5%88%99%E5%A4%84%E7%90%86%E5%AD%97%E5%B9%95%E6%96%87%E4%BB%B6%EF%BC%8Cpython/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar.gif">
      <meta itemprop="name" content="CT">
      <meta itemprop="description" content="this is a desc.">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="宁静致远">
    </span>
      <header class="post-header">
        <h1 class="post-title" itemprop="name headline">
          使用正则处理字幕文件
        </h1>

        <div class="post-meta">
            <span class="post-meta-item">
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              <span class="post-meta-item-text">发表于</span>

              <time title="创建时间：2020-03-06 14:13:49" itemprop="dateCreated datePublished" datetime="2020-03-06T14:13:49+08:00">2020-03-06</time>
            </span>

          
            <span id="/2020/03/06/%E4%BD%BF%E7%94%A8%E6%AD%A3%E5%88%99%E5%A4%84%E7%90%86%E5%AD%97%E5%B9%95%E6%96%87%E4%BB%B6%EF%BC%8Cpython/" class="post-meta-item leancloud_visitors" data-flag-title="使用正则处理字幕文件" title="阅读次数">
              <span class="post-meta-item-icon">
                <i class="fa fa-eye"></i>
              </span>
              <span class="post-meta-item-text">阅读次数：</span>
              <span class="leancloud-visitors-count"></span>
            </span>
  
  <span class="post-meta-item">
    
      <span class="post-meta-item-icon">
        <i class="fa fa-comment-o"></i>
      </span>
      <span class="post-meta-item-text">Valine：</span>
    
    <a title="valine" href="/2020/03/06/%E4%BD%BF%E7%94%A8%E6%AD%A3%E5%88%99%E5%A4%84%E7%90%86%E5%AD%97%E5%B9%95%E6%96%87%E4%BB%B6%EF%BC%8Cpython/#valine-comments" itemprop="discussionUrl">
      <span class="post-comments-count valine-comment-count" data-xid="/2020/03/06/%E4%BD%BF%E7%94%A8%E6%AD%A3%E5%88%99%E5%A4%84%E7%90%86%E5%AD%97%E5%B9%95%E6%96%87%E4%BB%B6%EF%BC%8Cpython/" itemprop="commentCount"></span>
    </a>
  </span>
  
  

        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">

      
        <h3 id="使用python中的re模块处理YouTube字幕文件"><a href="#使用python中的re模块处理YouTube字幕文件" class="headerlink" title="使用python中的re模块处理YouTube字幕文件"></a>使用python中的re模块处理YouTube字幕文件</h3><p>状况：18个字幕文件，编码格式Windows-1254，命名规律，按数字排序</p>
<p>需求：将字幕文件中的演讲词提取出来，剔除无用的字符，并合并为一个文件</p>
<h4 id="文件命名格式"><a href="#文件命名格式" class="headerlink" title="文件命名格式"></a>文件命名格式</h4><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br></pre></td><td class="code"><pre><span class="line">ls tmp&#x2F;</span><br><span class="line">total 84K</span><br><span class="line">-rw-r--r-- 1 root root 3.1K Apr 22 14:50 02-Odyssey_Plans__The_Stages_of_Life.srt</span><br><span class="line">-rw-r--r-- 1 root root 4.6K Apr 22 14:51 03-Odyssey_Plans__What_is_an_Odyssey_Plan_.srt</span><br><span class="line">-rw-r--r-- 1 root root 2.5K Apr 22 14:51 04-Odyssey_Plans__What_does_an_Odyssey_Plan_Include_.srt</span><br><span class="line">-rw-r--r-- 1 root root 2.9K Apr 22 14:52 05-Odyssey_Plans__Presentation_Format.srt</span><br><span class="line">-rw-r--r-- 1 root root 1.5K Apr 22 14:52 06-Odyssey_Plans__5-Year_Timelines.srt</span><br><span class="line">-rw-r--r-- 1 root root 1.2K Apr 22 14:53 07-Odyssey_Plans__6-Word_Title.srt</span><br><span class="line">-rw-r--r-- 1 root root 2.0K Apr 22 14:53 08-Odyssey_Plans__Designing_3_Timelines.srt</span><br><span class="line">-rw-r--r-- 1 root root 1.9K Apr 22 14:54 09-Odyssey_Plans__Building_your_10-year_timeline.srt</span><br><span class="line">-rw-r--r-- 1 root root 1.3K Apr 22 14:54 10-Odyssey_Plans__Choosing_a_Symbol.srt</span><br><span class="line">-rw-r--r-- 1 root root 3.2K Apr 22 14:54 
11-Odyssey_Plans__Creating_a_Dashboard.srt</span><br><span class="line">-rw-r--r-- 1 root root 1013 Apr 22 14:54 12-Odyssey_Plans__Identifying_Questions.srt</span><br><span class="line">-rw-r--r-- 1 root root 1.6K Apr 22 14:55 13-Odyssey_Plans__Writing_a_Thank-You_note.srt</span><br><span class="line">-rw-r--r-- 1 root root 2.8K Apr 22 14:55 14-Odyssey_Plans__How_to_&#39;Prototype&#39;_your_Odysseys.srt</span><br><span class="line">-rw-r--r-- 1 root root 4.7K Apr 22 14:55 15-Odyssey_Plans__Prototype_Conversations_and_Experiences.srt</span><br><span class="line">-rw-r--r-- 1 root root 3.7K Apr 22 14:55 16-Odyssey_Plans__How_often_to_design_an_Odyssey_plan.srt</span><br><span class="line">-rw-r--r-- 1 root root 3.8K Apr 22 14:56 17-Odyssey_Plans__Insights_and_Takeaways.srt</span><br><span class="line">-rw-r--r-- 1 root root 3.2K Apr 24 19:55 01-Odyssey_Plans__What_are_the_Odyssey_Years_.srt</span><br><span class="line">-rw-r--r-- 1 root root 5.8K Apr 24 20:05 18-Odyssey_Plans__Applying_Designers&#39;_Mindsets.srt</span><br></pre></td></tr></table></figure>
<h4 id="文件内容格式"><a href="#文件内容格式" class="headerlink" title="文件内容格式"></a>文件内容格式</h4><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br></pre></td><td class="code"><pre><span class="line">1</span><br><span class="line">00:00:03,590 --&gt; 00:00:04,710</span><br><span class="line">My name is Bill Burnett.</span><br><span class="line"></span><br><span class="line">2</span><br><span class="line">00:00:04,710 --&gt; 00:00:07,460</span><br><span class="line">I&#39;m one of the co-authors</span><br><span class="line">of Designing Your Life,</span><br><span class="line"></span><br><span class="line">3</span><br><span class="line">00:00:07,460 --&gt; 00:00:09,410</span><br><span class="line">How to Live a</span><br><span class="line">Well-Lived Joyful Life.</span><br><span class="line"></span><br><span class="line">......</span><br></pre></td></tr></table></figure>
<h4 id="思路分析"><a href="#思路分析" class="headerlink" title="思路分析"></a>思路分析</h4><ul>
<li>获取所有文件名存入列表  &lt;–  方便循环进行文件操作</li>
<li>然后用正则匹配演讲词并打印出来  &lt;– 剔除无用字符</li>
<li>执行脚本并将输出重定向到一个新文件 &lt;– 合并为一个文件</li>
</ul>
<!-- Highlighted listing of merge_final.py. Keep every statement inside a single "line" span: raw newlines inside the pre element are rendered literally and would split the statement. -->
<h4 id="实现代码"><a href="#实现代码" class="headerlink" title="实现代码"></a>实现代码</h4><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br></pre></td><td class="code"><pre><span class="line"># Script_name: merge_final.py</span><br><span class="line"></span><br><span class="line">import re</span><br><span class="line">import os</span><br><span class="line"></span><br><span class="line"># 字幕文件所在目录</span><br><span class="line">dir &#x3D; &quot;&#x2F;root&#x2F;tmp&#x2F;&quot;</span><br><span class="line"></span><br><span class="line">def process(filename):</span><br><span class="line">        &#39;&#39;&#39;对文件进行操作&#39;&#39;&#39;</span><br><span class="line">        print(filename)</span><br><span class="line">        </span><br><span class="line">        with open(dir+filename, &#39;r&#39;, encoding&#x3D;&#39;utf-8&#39;) as f:       # 打开文件</span><br><span class="line">                lines &#x3D; f.readlines()       # 按行读取整个文件，存入列表</span><br><span class="line">                </span><br><span class="line">                for line in lines:</span><br><span class="line">                        # 剔除无用字符</span><br><span class="line">                        lrc &#x3D; re.search(r&#39;^\D.*[a-z.]&#39;, line) </span><br><span class="line">                        if lrc: </span><br><span class="line">                                print(lrc.group()) </span><br><span class="line">        </span><br><span class="line">        print(&quot;--------------------&quot;)</span><br><span class="line"></span><br><span class="line"></span><br><span class="line">def main():</span><br><span class="line">        filelist &#x3D; os.listdir(dir)      # 获取目录下所有的文件名称</span><br><span class="line">        filelist.sort()                 # 按照数字排序</span><br><span class="line">        </span><br><span class="line">        for filename in filelist:</span><br><span class="line">                process(filename)       # 将文件名传过去</span><br><span class="line"></span><br><span class="line"></span><br><span class="line">if __name__ &#x3D;&#x3D; &quot;__main__&quot;:</span><br><span class="line">        main()</span><br></pre></td></tr></table></figure>
<h4 id="执行脚本"><a href="#执行脚本" class="headerlink" title="执行脚本"></a>执行脚本</h4><figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">[root@dayday ~]# python merge_final.py &gt; lecture.txt</span><br></pre></td></tr></table></figure>

<h4 id="知识点整理"><a href="#知识点整理" class="headerlink" title="知识点整理"></a>知识点整理</h4><p><strong>1. 编码格式问题</strong></p>
<p>在读取中文的情况下，通常会遇到一些编码的问题。首先需要了解文件目前的编码方式是什么，然后再用decode或者encode去解码和编码。下面使用chardet库来查看文件的编码方式。</p>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line">import chardet</span><br><span class="line"></span><br><span class="line">path &#x3D; &quot;E:&#x2F;t.csv&quot;</span><br><span class="line">f &#x3D; open(path,&#39;rb&#39;)</span><br><span class="line"></span><br><span class="line">data &#x3D; f.read()</span><br><span class="line">print(chardet.detect(data))</span><br></pre></td></tr></table></figure>
<figure class="highlight plain"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line">[root@dayday ~]# python test.py</span><br><span class="line">&#123;&#39;encoding&#39;: &#39;GB2312&#39;, &#39;confidence&#39;: 0.99, &#39;language&#39;: &#39;Chinese&#39;&#125;</span><br></pre></td></tr></table></figure>
<p>当时的字幕是从<a href="http://downsub.com/" target="_blank" rel="noopener">downsub.com</a>网站下载的，编码格式是Windows-1254格式，在Python进行文件操作时总是报错，偷懒在notepad++下手动更改编码格式为utf-8，当然也可以用Python脚本实现。</p>
<p><strong>2. 导入目录下所有文件的名称&amp;排序问题</strong></p>
<p>这个就是知识没掌握的问题了，在此记录。os是个强大的模块！</p>
<p><strong>3. 正则语法</strong></p>
<p>好多语法格式都不熟练，一边调试一边查<a href="https://www.runoob.com/" target="_blank" rel="noopener">runoob.com</a>，着实耗时间，需反复练习！</p>
<p><strong>4. re.match()返回值问题</strong></p>
<p>re.match()（以及上面脚本中实际使用的re.search()）匹配成功时返回Match对象，可用.group()方法提取匹配到的字符串；匹配失败则返回None。</p>
<p>文件是按行读取的，因此循环对每一行进行正则匹配，返回的结果中穿插着None和Match对象，而None调用.group()方法会报错，所以在这里简单地用if过滤掉了None。</p>
<p>在敲这篇记录时突然想起，应该可以用Python的异常处理过滤掉报错，改天试试看</p>
<p><strong>5. 养成随手记录的习惯</strong></p>
<p>在整个解决过程中，查了很多资料，但没有随时记下来，解决完问题做记录的时候有些问题都忘记了！</p>

    </div>

    
    
    

      <footer class="post-footer">

        


        
    <div class="post-nav">
      <div class="post-nav-item">
    <a href="/2020/03/06/hello-world/" rel="prev" title="Hello World">
      <i class="fa fa-chevron-left"></i> Hello World
    </a></div>
      <div class="post-nav-item">
    <a href="/2020/03/06/%E5%AE%89%E8%A3%85%E9%BB%91%E8%8B%B9%E6%9E%9C%E5%B0%8F%E8%AE%B0/" rel="next" title="安装黑苹果记录">
      安装黑苹果记录 <i class="fa fa-chevron-right"></i>
    </a></div>
    </div>
      </footer>
    
  </article>
  
  
  

  </div>


          </div>
          
    <div class="comments" id="valine-comments"></div>

<script>
  // When a 'tabs:register' event fires, click the comment tab whose class was
  // stored for this visitor (if storage is on), otherwise the configured default.
  window.addEventListener('tabs:register', () => {
    const commentSettings = CONFIG.comments;
    let preferredClass = commentSettings.activeClass;
    if (commentSettings.storage) {
      preferredClass = localStorage.getItem('comments_active') || preferredClass;
    }
    if (!preferredClass) return;
    const tabLink = document.querySelector(`a[href="#comment-${preferredClass}"]`);
    if (tabLink) tabLink.click();
  });
  // Persist the clicked comment pane, but only when storage is enabled.
  if (CONFIG.comments.storage) {
    window.addEventListener('tabs:click', ({ target }) => {
      const isCommentPane = target.matches('.tabs-comment .tab-content .tab-pane');
      if (!isCommentPane) return;
      // The pane's second class name is the value that gets stored.
      localStorage.setItem('comments_active', target.classList[1]);
    });
  }
</script>

        </div>
          
  
  <div class="toggle sidebar-toggle">
    <span class="toggle-line toggle-line-first"></span>
    <span class="toggle-line toggle-line-middle"></span>
    <span class="toggle-line toggle-line-last"></span>
  </div>

  <aside class="sidebar">
    <div class="sidebar-inner">

      <ul class="sidebar-nav motion-element">
        <li class="sidebar-nav-toc">
          文章目录
        </li>
        <li class="sidebar-nav-overview">
          站点概览
        </li>
      </ul>

      <!--noindex-->
      <div class="post-toc-wrap sidebar-panel">
          <div class="post-toc motion-element"><ol class="nav"><li class="nav-item nav-level-3"><a class="nav-link" href="#使用python中的re模块处理YouTube字幕文件"><span class="nav-number">1.</span> <span class="nav-text">使用python中的re模块处理YouTube字幕文件</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#文件命名格式"><span class="nav-number">1.1.</span> <span class="nav-text">文件命名格式</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#文件内容格式"><span class="nav-number">1.2.</span> <span class="nav-text">文件内容格式</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#思路分析"><span class="nav-number">1.3.</span> <span class="nav-text">思路分析</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#实现代码"><span class="nav-number">1.4.</span> <span class="nav-text">实现代码</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#执行脚本"><span class="nav-number">1.5.</span> <span class="nav-text">执行脚本</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#知识点整理"><span class="nav-number">1.6.</span> <span class="nav-text">知识点整理</span></a></li></ol></li></ol></div>
      </div>
      <!--/noindex-->

      <div class="site-overview-wrap sidebar-panel">
        <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
    <img class="site-author-image" itemprop="image" alt="CT"
      src="/images/avatar.gif">
  <p class="site-author-name" itemprop="name">CT</p>
  <div class="site-description" itemprop="description">this is a desc.</div>
</div>
<div class="site-state-wrap motion-element">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
          <a href="/archives/">
        
          <span class="site-state-item-count">16</span>
          <span class="site-state-item-name">日志</span>
        </a>
      </div>
      <div class="site-state-item site-state-tags">
        <span class="site-state-item-count">5</span>
        <span class="site-state-item-name">标签</span>
      </div>
  </nav>
</div>
  <div class="links-of-author motion-element">
      <span class="links-of-author-item">
        <a href="https://github.com/daydaychen" title="GitHub → https:&#x2F;&#x2F;github.com&#x2F;daydaychen" rel="noopener" target="_blank"><i class="fa fa-fw fa-github"></i>GitHub</a>
      </span>
      <span class="links-of-author-item">
        <a href="mailto:ctt0120@foxmail.com" title="E-Mail → mailto:ctt0120@foxmail.com" rel="noopener" target="_blank"><i class="fa fa-fw fa-envelope"></i>E-Mail</a>
      </span>
  </div>
  <div class="cc-license motion-element" itemprop="license">
    <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/deed.zh" class="cc-opacity" rel="noopener" target="_blank"><img src="/images/cc-by-nc-sa.svg" alt="Creative Commons"></a>
  </div>



      </div>
        <div class="back-to-top motion-element">
          <i class="fa fa-arrow-up"></i>
          <span>0%</span>
        </div>

    </div>
  </aside>
  <div id="sidebar-dimmer"></div>


      </div>
    </main>

    <footer class="footer">
      <div class="footer-inner">
        

<div class="copyright">
  
  &copy; 
  <span itemprop="copyrightYear">2020</span>
  <span class="with-love">
    <i class="fa fa-user"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">CT</span>
</div>
  <div class="powered-by">由 <a href="https://hexo.io/" class="theme-link" rel="noopener" target="_blank">Hexo</a> 强力驱动 v4.2.0
  </div>
  <span class="post-meta-divider">|</span>
  <div class="theme-info">主题 – <a href="https://theme-next.org/" class="theme-link" rel="noopener" target="_blank">NexT.Gemini</a> v7.7.2
  </div>

        








      </div>
    </footer>
  </div>

  
  <!-- Third-party libraries from the jsDelivr CDN, requested over explicit HTTPS instead of protocol-relative URLs. -->
  <script src="https://cdn.jsdelivr.net/npm/animejs@3.1.0/lib/anime.min.js"></script>
  <script src="https://cdn.jsdelivr.net/gh/theme-next/theme-next-pjax@0/pjax.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/medium-zoom@1/dist/medium-zoom.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/lozad@1/dist/lozad.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/pangu@4/dist/browser/pangu.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/velocity-animate@1/velocity.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/velocity-animate@1/velocity.ui.min.js"></script>

<script src="/js/utils.js"></script>

<script src="/js/motion.js"></script>


<script src="/js/schemes/pisces.js"></script>


<script src="/js/next-boot.js"></script>

  <script>
// Create the Pjax instance used for in-site navigation.
// NOTE(review): the meaning of each option is inferred from its name; confirm
// against the Pjax library documentation before changing any of them.
var pjax = new Pjax({
  // Selectors handed to Pjax: the document title, the per-page config script,
  // the main content wrapper, the table-of-contents wrapper and the #pjax container.
  selectors: [
    'head title',
    '#page-configurations',
    '.content-wrap',
    '.post-toc-wrap',
    '#pjax'
  ],
  // The table-of-contents wrapper is assigned Pjax's innerHTML switch.
  switches: {
    '.post-toc-wrap': Pjax.switches.innerHTML
  },
  analytics: false,
  cacheBust: false,
  // scrollTo is false whenever the bookmark feature is enabled.
  scrollTo : !CONFIG.bookmark.enable
});

// After each 'pjax:success' event: rebuild the page's pjax-related scripts, then
// refresh the theme and re-run its motion sequence.
window.addEventListener('pjax:success', () => {
  // Every script marked with a pjax attribute, the page-configurations script and
  // every script inside #pjax is removed and replaced by a newly created script
  // element carrying the same id, class, type, src, pjax attribute and inline code.
  // The replacement is appended at the end of the same parent node.
  // Presumably this is what makes the browser execute them again - TODO confirm.
  document.querySelectorAll('script[pjax], script#page-configurations, #pjax script').forEach(element => {
    // Inline source of the old script; empty string when it has none.
    var code = element.text || element.textContent || element.innerHTML || '';
    var parent = element.parentNode;
    parent.removeChild(element);
    var script = document.createElement('script');
    if (element.id) {
      script.id = element.id;
    }
    if (element.className) {
      script.className = element.className;
    }
    if (element.type) {
      script.type = element.type;
    }
    if (element.src) {
      script.src = element.src;
      // Force synchronous loading of peripheral JS.
      script.async = false;
    }
    // Keep the pjax marker so the script is matched again on the next navigation.
    if (element.getAttribute('pjax') !== null) {
      script.setAttribute('pjax', '');
    }
    if (code !== '') {
      script.appendChild(document.createTextNode(code));
    }
    parent.appendChild(script);
  });
  NexT.boot.refresh();
  // Define Motion Sequence & Bootstrap Motion.
  if (CONFIG.motion.enable) {
    NexT.motion.integrator
      .init()
      .add(NexT.motion.middleWares.subMenu)
      .add(NexT.motion.middleWares.postList)
      .bootstrap();
  }
  NexT.utils.updateSidebarPosition();
});
</script>




  




  
<script src="/js/local-search.js"></script>













    <div id="pjax">
  

  


<script>
// Initialise the Valine comment widget inside #valine-comments.
// NexT.utils.loadComments receives the container element and a callback; the callback
// fetches the Valine bundle and then constructs the widget.
// NOTE(review): the third getScript argument (window.Valine) is presumably a
// "skip the download when already defined" condition - confirm in /js/utils.js.
NexT.utils.loadComments(document.querySelector('#valine-comments'), () => {
  NexT.utils.getScript('https://cdn.jsdelivr.net/npm/valine@1/dist/Valine.min.js', () => {
    // Only these commenter fields are allowed; any other name in the configured
    // comma-separated list is dropped.
    var GUEST = ['nick', 'mail', 'link'];
    var guest = 'nick,mail,link'.split(',').filter(item => GUEST.includes(item));
    new Valine({
      el         : '#valine-comments',
      verify     : false,
      notify     : false,
      // NOTE(review): these LeanCloud credentials are delivered to every visitor;
      // make sure the app is restricted to this site's domain in the LeanCloud console.
      appId      : '9bBjpz3HRuKNuKNp7T27dFsT-gzGzoHsz',
      appKey     : 'k2pzqGWjkjrKxlFLQ3O0oqDn',
      placeholder: "Just go go",
      avatar     : 'robohash',
      meta       : guest,
      // Numeric page size; the former `'10' || 10` always produced the string '10'.
      pageSize   : 10,
      visitor    : true,
      lang       : 'zh-cn',
      path       : location.pathname,
      recordIP   : false,
      serverURLs : ''
    });
  }, window.Valine);
});
</script>

    </div>
</body>
</html>
